201805032 - Ardıl Silan Aydın 201805016 - Beril Kartal 201805060 - Elif Yılmaz 201805017 - Neslihan Özdil 201805045 - Orhan Gazi Barak 201805040 - Yusuf Çelikkıran
Checking and improving column names
import pandas as pd
# Verisetini yükle
data = pd.read_csv("isedataset.csv")
# Sütun isimlerini yazdır
print(data.columns)
Index(['Unnamed: 0', 'Open', 'High', 'Low', 'Close', 'Volume', 'Symbol',
'Predict', 'Unnamed: 8'],
dtype='object')
import pandas as pd
# Verisetini yükle
data = pd.read_csv("isedataset.csv")
# Sütun isimlerini belirle
data.columns = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Symbol', 'Predict','Predict2']
# Sütun sil
data.drop("Predict2", axis=1, inplace=True)
# Sütun isimlerini yazdır
print(data.columns)
data.to_csv("isedataset_uptade.csv", index=False)
Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Symbol', 'Predict'], dtype='object')
Converting data to numeric
import pandas as pd
from sklearn.preprocessing import LabelEncoder
# Veri setini yükle
data = pd.read_csv("isedataset_uptade.csv")
# Tarih sütununu datetime nesnesine dönüştür
data['Date'] = pd.to_datetime(data['Date'])
# Tarih özelliklerini ayır
data['Year'] = data['Date'].dt.year
data['Month'] = data['Date'].dt.month
data['Day'] = data['Date'].dt.day
data.drop("Date", axis=1, inplace=True)
# LabelEncoder oluştur
label_encoder = LabelEncoder()
# Symbol sütununu label encoding uygula
data['Symbol_Encoded'] = label_encoder.fit_transform(data['Symbol'])
data.drop("Symbol", axis=1, inplace=True)
# "Predict" sütununu veri setinden çıkar
predict_column = data.pop("Predict")
# "Predict" sütununu en sona ekle
data["Predict"] = predict_column
# Sonucu yazdır
print(data.head())
# Veriyi yeni CSV dosyasına kaydet
data.to_csv("Numeric_isedataset.csv", index=False)
Open High Low Close Volume Year Month Day \ 0 27.500000 27.500000 27.500000 27.500000 262214 2023 6 15 1 30.240000 30.240000 30.240000 30.240000 1169499 2023 6 16 2 31.000000 31.100000 29.940001 29.940001 8064437 2023 6 20 3 26.959999 26.959999 26.959999 26.959999 2147415 2023 6 21 4 25.620001 27.620001 25.500000 25.940001 71898180 2023 6 22 Symbol_Encoded Predict 0 0 30.240000 1 0 29.940001 2 0 26.959999 3 0 25.940001 4 0 25.900000
Creating new features
import pandas as pd
# Veri setini yükle
data = pd.read_csv("Numeric_isedataset.csv")
# Hareketli Ortalama (Moving Average) hesaplama
data['5_Day_Average'] = data.groupby('Symbol_Encoded')['Close'].transform(lambda x: x.rolling(window=5, min_periods=1).mean())
data['10_Day_Average'] = data.groupby('Symbol_Encoded')['Close'].transform(lambda x: x.rolling(window=10, min_periods=1).mean())
# Günlük Fiyat Değişimi (Daily Price Change) hesaplama
data['Daily_Price_Change'] = data.groupby('Symbol_Encoded')['Close'].transform(lambda x: x.pct_change())
# Hacim İstatistikleri hesaplama
data['Average_Volume'] = data.groupby('Symbol_Encoded')['Volume'].transform(lambda x: x.rolling(window=5, min_periods=1).mean())
data['Maximum_Volume'] = data.groupby('Symbol_Encoded')['Volume'].transform(lambda x: x.rolling(window=5, min_periods=1).max())
data['Minimum_Volume'] = data.groupby('Symbol_Encoded')['Volume'].transform(lambda x: x.rolling(window=5, min_periods=1).min())
# Volatilite hesaplama
data['Volatility'] = data.groupby('Symbol_Encoded')['Close'].transform(lambda x: x.rolling(window=5, min_periods=1).std())
# Önceki gün kapanış fiyatı
data['Previous_Day_Close'] = data.groupby('Symbol_Encoded')['Close'].shift(1)
# Relatif Güç Endeksi (Relative Strength Index - RSI)
def rsi(values):
delta = values.diff()
gain = (delta.where(delta > 0, 0)).rolling(window=14, min_periods=1).mean()
loss = (-delta.where(delta < 0, 0)).rolling(window=14, min_periods=1).mean()
rs = gain / loss
return 100 - (100 / (1 + rs))
data['RSI'] = data.groupby('Symbol_Encoded')['Close'].transform(rsi)
# NaN değerleri temizleme
data.dropna(inplace=True)
# Sonucu yazdır
print(data.head())
# Veriyi yeni CSV dosyasına kaydet
data.to_csv("Numeric_isedataset_withNewFeatures.csv", index=False)
Open High Low Close Volume Year Month Day \ 1 30.240000 30.240000 30.240000 30.240000 1169499 2023 6 16 2 31.000000 31.100000 29.940001 29.940001 8064437 2023 6 20 3 26.959999 26.959999 26.959999 26.959999 2147415 2023 6 21 4 25.620001 27.620001 25.500000 25.940001 71898180 2023 6 22 5 26.139999 26.600000 25.700001 25.900000 19922530 2023 6 23 Symbol_Encoded Predict 5_Day_Average 10_Day_Average \ 1 0 29.940001 28.870000 28.870000 2 0 26.959999 29.226667 29.226667 3 0 25.940001 28.660000 28.660000 4 0 25.900000 28.116000 28.116000 5 0 25.900000 27.796000 27.746667 Daily_Price_Change Average_Volume Maximum_Volume Minimum_Volume \ 1 0.099636 7.158565e+05 1169499.0 262214.0 2 -0.009921 3.165383e+06 8064437.0 262214.0 3 -0.099532 2.910891e+06 8064437.0 262214.0 4 -0.037834 1.670835e+07 71898180.0 262214.0 5 -0.001542 2.064041e+07 71898180.0 1169499.0 Volatility Previous_Day_Close RSI 1 1.937472 27.500000 100.000000 2 1.502842 30.240000 90.131601 3 1.670370 29.940001 45.514943 4 1.890048 26.959999 38.920457 5 2.139411 25.940001 38.700562
Standardization and stats
import pandas as pd
from sklearn.preprocessing import StandardScaler
# Veri setini yükle
df = pd.read_csv("Numeric_isedataset_withNewFeatures.csv")
# Sadece sayısal sütunları seç
numeric_columns = df.select_dtypes(include=['int64', 'float64'])
# Standartlaştırma işlemi için StandardScaler'ı kullan
scaler = StandardScaler()
standardized_data = scaler.fit_transform(numeric_columns)
# Standartlaştırılmış veriyi DataFrame'e dönüştür
standardized_df = pd.DataFrame(standardized_data, columns=numeric_columns.columns)
# Standartlaştırılmış veriyi CSV dosyasına kaydet
standardized_df.to_csv("standardized_isedataset.csv", index=False)
standardized_df
| Open | High | Low | Close | Volume | Year | Month | Day | Symbol_Encoded | Predict | 5_Day_Average | 10_Day_Average | Daily_Price_Change | Average_Volume | Maximum_Volume | Minimum_Volume | Volatility | Previous_Day_Close | RSI | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.038546 | -0.038723 | -0.038380 | -0.038554 | -0.309809 | -0.432882 | -0.165266 | 0.043639 | -1.725953 | -0.037914 | -0.041167 | -0.044603 | 2.432918 | -0.344201 | -0.339627 | -0.339898 | -0.021871 | -0.039540 | 2.347486 |
| 1 | -0.038536 | -0.038712 | -0.038384 | -0.038558 | -0.121767 | -0.432882 | -0.165266 | 0.504852 | -1.725953 | -0.037951 | -0.041161 | -0.044597 | -0.358623 | -0.272701 | -0.208493 | -0.339898 | -0.021924 | -0.039503 | 1.833829 |
| 2 | -0.038589 | -0.038766 | -0.038423 | -0.038597 | -0.283139 | -0.432882 | -0.165266 | 0.620155 | -1.725953 | -0.037964 | -0.041170 | -0.044607 | -2.641957 | -0.280129 | -0.208493 | -0.339898 | -0.021903 | -0.039507 | -0.488495 |
| 3 | -0.038606 | -0.038757 | -0.038442 | -0.038610 | 1.619132 | -0.432882 | -0.165266 | 0.735459 | -1.725953 | -0.037964 | -0.041178 | -0.044616 | -1.069858 | 0.122608 | 1.005548 | -0.339898 | -0.021877 | -0.039548 | -0.831742 |
| 4 | -0.038599 | -0.038770 | -0.038439 | -0.038611 | 0.201631 | -0.432882 | -0.165266 | 0.850762 | -1.725953 | -0.037964 | -0.041182 | -0.044622 | -0.145135 | 0.237382 | 1.005548 | -0.299326 | -0.021847 | -0.039562 | -0.843187 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 114395 | -0.038844 | -0.039021 | -0.038681 | -0.038855 | -0.254404 | 2.310097 | -1.304996 | 0.043639 | 1.673645 | -0.038201 | -0.041483 | -0.044967 | -0.565266 | -0.226983 | -0.202115 | -0.211732 | -0.022091 | -0.039817 | 0.400878 |
| 114396 | -0.038846 | -0.039022 | -0.038683 | -0.038857 | -0.246743 | 2.310097 | -1.304996 | 0.389549 | 1.673645 | -0.038197 | -0.041484 | -0.044967 | -0.825624 | -0.224918 | -0.202115 | -0.208478 | -0.022081 | -0.039819 | -0.255045 |
| 114397 | -0.038847 | -0.039022 | -0.038683 | -0.038853 | -0.245244 | 2.310097 | -1.304996 | 0.504852 | 1.673645 | -0.038195 | -0.041484 | -0.044967 | 0.968180 | -0.234726 | -0.202115 | -0.208478 | -0.022083 | -0.039822 | 0.145345 |
| 114398 | -0.038845 | -0.039020 | -0.038681 | -0.038851 | -0.179529 | 2.310097 | -1.304996 | 0.620155 | 1.673645 | -0.038194 | -0.041485 | -0.044967 | 0.498292 | -0.221683 | -0.202115 | -0.208478 | -0.022085 | -0.039818 | 0.399393 |
| 114399 | -0.038843 | -0.039017 | -0.038678 | -0.038850 | -0.185710 | 2.310097 | -1.304996 | 0.735459 | 1.673645 | -0.038194 | -0.041484 | -0.044967 | 0.241299 | -0.237328 | -0.248774 | -0.208478 | -0.022079 | -0.039816 | 0.662611 |
114400 rows × 19 columns
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Veri setini yükleyin
df = pd.read_csv("standardized_isedataset.csv")
# Her bir sütun için istatistiksel özet
stats_df = df.describe()
# Yeni CSV dosyasına kaydet
stats_df.to_csv("stats_isedataset.csv", index=False)
# İstatistiksel özet tablosunu göster
stats_df
| Open | High | Low | Close | Volume | Year | Month | Day | Symbol_Encoded | Predict | 5_Day_Average | 10_Day_Average | Daily_Price_Change | Average_Volume | Maximum_Volume | Minimum_Volume | Volatility | Previous_Day_Close | RSI | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1.144000e+05 | 1.144000e+05 | 1.144000e+05 | 1.144000e+05 | 1.144000e+05 | 1.144000e+05 | 1.144000e+05 | 1.144000e+05 | 1.144000e+05 | 1.144000e+05 | 1.144000e+05 | 1.144000e+05 | 1.144000e+05 | 1.144000e+05 | 1.144000e+05 | 1.144000e+05 | 1.144000e+05 | 1.144000e+05 | 1.144000e+05 |
| mean | -3.017897e-16 | 1.941060e-16 | 3.714037e-17 | -2.773163e-16 | 2.548197e-16 | 1.438000e-13 | -2.235353e-16 | -1.019144e-16 | 1.420137e-14 | -5.127382e-17 | 1.551504e-17 | 2.927525e-16 | 9.400066e-17 | 1.007125e-16 | 1.592420e-16 | -1.165544e-15 | -4.923582e-17 | -1.490928e-17 | -2.652647e-16 |
| std | 1.000004e+00 | 1.000004e+00 | 1.000004e+00 | 1.000004e+00 | 1.000004e+00 | 1.000004e+00 | 1.000004e+00 | 1.000004e+00 | 1.000004e+00 | 1.000004e+00 | 1.000004e+00 | 1.000004e+00 | 1.000004e+00 | 1.000004e+00 | 1.000004e+00 | 1.000004e+00 | 1.000004e+00 | 1.000004e+00 | 1.000004e+00 |
| min | -3.892925e-02 | -3.910575e-02 | -3.876393e-02 | -3.893759e-02 | -3.417037e-01 | -4.328823e-01 | -1.589929e+00 | -1.685911e+00 | -1.725953e+00 | -3.827785e-02 | -4.157864e-02 | -4.507743e-02 | -1.601519e+01 | -3.650962e-01 | -3.618693e-01 | -3.516240e-01 | -2.210560e-02 | -3.990638e-02 | -2.857575e+00 |
| 25% | -3.881965e-02 | -3.899347e-02 | -3.865679e-02 | -3.882808e-02 | -3.279434e-01 | -4.328823e-01 | -1.020064e+00 | -8.787876e-01 | -8.744158e-01 | -3.817247e-02 | -4.145610e-02 | -4.493711e-02 | -5.932093e-01 | -3.486805e-01 | -3.456415e-01 | -3.372884e-01 | -2.207770e-02 | -3.979137e-02 | -7.005876e-01 |
| 50% | -3.862620e-02 | -3.879402e-02 | -3.846808e-02 | -3.863481e-02 | -2.855387e-01 | -4.328823e-01 | 1.196667e-01 | -7.166443e-02 | 3.322323e-03 | -3.798627e-02 | -4.123851e-02 | -4.468878e-02 | -7.462400e-02 | -2.999277e-01 | -2.970659e-01 | -2.932099e-01 | -2.202653e-02 | -3.958822e-02 | 2.558549e-02 |
| 75% | -3.812795e-02 | -3.828154e-02 | -3.798377e-02 | -3.813704e-02 | -1.276548e-01 | -4.328823e-01 | 9.744646e-01 | 8.507620e-01 | 8.876108e-01 | -3.750627e-02 | -4.068195e-02 | -4.405329e-02 | 4.997209e-01 | -1.269333e-01 | -1.225789e-01 | -1.309946e-01 | -2.186921e-02 | -3.906637e-02 | 7.296960e-01 |
| max | 1.157220e+02 | 1.156265e+02 | 1.157965e+02 | 1.157076e+02 | 2.634714e+01 | 2.310097e+00 | 1.544330e+00 | 1.773188e+00 | 1.673645e+00 | 1.110694e+02 | 1.145794e+02 | 1.090654e+02 | 1.387008e+01 | 1.893881e+01 | 1.824998e+01 | 2.278814e+01 | 1.355416e+02 | 1.218055e+02 | 2.347486e+00 |
correlation
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
# final_dataset.csv dosyasını DataFrame'e yükleyin
df = pd.read_csv("standardized_isedataset.csv")
# Sayısal sütunları seçin
numeric_cols = df.select_dtypes(include=[np.number]).columns
# Korelasyon matrisini hesaplayın
correlation_matrix = df[numeric_cols].corr()
# Korelasyon matrisinin her bir öğesinin mutlak değerini alın
absolute_correlation_matrix = correlation_matrix.abs()
# Matristeki değerleri normalize edin (isteğe bağlı olarak)
normalized_correlation_matrix = absolute_correlation_matrix / absolute_correlation_matrix.max().max()
# Korelasyon haritasını çizin
plt.figure(figsize=(20, 10)) # Kutucukları büyütme
sns.heatmap(normalized_correlation_matrix, annot=True, cmap='coolwarm', linewidths=.5, annot_kws={"size": 12}) # Kutucuk boyutunu belirleme
plt.show()
I created a separate chart for each symbol using the new features I added and the inputs that affect the output values the most. I used Orange to see which inputs had the most impact.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Veri setini yükleyin
df = pd.read_csv("standardized_isedataset.csv")
# 'Symbol_Encoded' sütunundaki benzersiz sembollerin listesini alın
symbols = df['Symbol_Encoded'].unique()
# Her sembol için girdileri ile 'Predict' sütununu karşılaştıran grafikler oluşturun
for symbol in symbols:
symbol_df = df[df['Symbol_Encoded'] == symbol]
# Sembol için girdileri seçin
input_cols = ['Close', 'Low', 'High', 'RSI','5_Day_Average','10_Day_Average','Daily_Price_Change'
,'Average_Volume','Maximum_Volume','Minimum_Volume','Volatility','Previous_Day_Close']
# Her bir girdi sütunu için grafikler oluşturun
for column in input_cols:
plt.figure(figsize=(18, 5))
# Histogram
plt.subplot(1, 5, 1)
sns.histplot(symbol_df[column], kde=True)
plt.title(f"{column} Histogram")
# Bar Grafik
plt.subplot(1, 5, 2)
sns.barplot(x='Predict', y=column, data=symbol_df)
plt.title(f"{column} vs Prediction Bar Chart - Symbol: {symbol}")
# Scatter Plot
plt.subplot(1, 5, 3)
sns.scatterplot(x=column, y='Predict', data=symbol_df)
plt.title(f"{column} vs Prediction Scatter Plot - Symbol: {symbol}")
# Violin Plot
plt.subplot(1, 5, 4)
sns.violinplot(y=column, data=symbol_df)
plt.title(f"{column} Violin Plot - Symbol: {symbol}")
# Box Plot
plt.subplot(1, 5, 5)
sns.boxplot(y=column, data=symbol_df)
plt.title(f"{column} Box Plot - Symbol: {symbol}")
plt.tight_layout()
plt.show()
--------------------------------------------------------------------------- KeyboardInterrupt Traceback (most recent call last) Input In [1], in <cell line: 12>() 46 plt.title(f"{column} Box Plot - Symbol: {symbol}") 48 plt.tight_layout() ---> 49 plt.show() File ~/opt/anaconda3/lib/python3.9/site-packages/matplotlib/pyplot.py:368, in show(*args, **kwargs) 324 """ 325 Display all open figures. 326 (...) 365 explicitly there. 366 """ 367 _warn_if_gui_out_of_main_thread() --> 368 return _backend_mod.show(*args, **kwargs) File ~/opt/anaconda3/lib/python3.9/site-packages/matplotlib_inline/backend_inline.py:41, in show(close, block) 39 try: 40 for figure_manager in Gcf.get_all_fig_managers(): ---> 41 display( 42 figure_manager.canvas.figure, 43 metadata=_fetch_figure_metadata(figure_manager.canvas.figure) 44 ) 45 finally: 46 show._to_draw = [] File ~/opt/anaconda3/lib/python3.9/site-packages/IPython/core/display_functions.py:298, in display(include, exclude, metadata, transient, display_id, raw, clear, *objs, **kwargs) 296 publish_display_data(data=obj, metadata=metadata, **kwargs) 297 else: --> 298 format_dict, md_dict = format(obj, include=include, exclude=exclude) 299 if not format_dict: 300 # nothing to display (e.g. _ipython_display_ took over) 301 continue File ~/opt/anaconda3/lib/python3.9/site-packages/IPython/core/formatters.py:178, in DisplayFormatter.format(self, obj, include, exclude) 176 md = None 177 try: --> 178 data = formatter(obj) 179 except: 180 # FIXME: log the exception 181 raise File <decorator-gen-2>:2, in __call__(self, obj) File ~/opt/anaconda3/lib/python3.9/site-packages/IPython/core/formatters.py:222, in catch_format_error(method, self, *args, **kwargs) 220 """show traceback on failed format call""" 221 try: --> 222 r = method(self, *args, **kwargs) 223 except NotImplementedError: 224 # don't warn on NotImplementedErrors 225 return self._check_return(None, args[0]) File ~/opt/anaconda3/lib/python3.9/site-packages/IPython/core/formatters.py:339, in BaseFormatter.__call__(self, obj) 337 pass 338 else: --> 339 return printer(obj) 340 # Finally look for special method names 341 method = get_real_method(obj, self.print_method) File ~/opt/anaconda3/lib/python3.9/site-packages/IPython/core/pylabtools.py:151, in print_figure(fig, fmt, bbox_inches, base64, **kwargs) 148 from matplotlib.backend_bases import FigureCanvasBase 149 FigureCanvasBase(fig) --> 151 fig.canvas.print_figure(bytes_io, **kw) 152 data = bytes_io.getvalue() 153 if fmt == 'svg': File ~/opt/anaconda3/lib/python3.9/site-packages/matplotlib/backend_bases.py:2299, in FigureCanvasBase.print_figure(self, filename, dpi, facecolor, edgecolor, orientation, format, bbox_inches, pad_inches, bbox_extra_artists, backend, **kwargs) 2297 if bbox_inches: 2298 if bbox_inches == "tight": -> 2299 bbox_inches = self.figure.get_tightbbox( 2300 renderer, bbox_extra_artists=bbox_extra_artists) 2301 if pad_inches is None: 2302 pad_inches = rcParams['savefig.pad_inches'] File ~/opt/anaconda3/lib/python3.9/site-packages/matplotlib/figure.py:1632, in FigureBase.get_tightbbox(self, renderer, bbox_extra_artists) 1629 artists = bbox_extra_artists 1631 for a in artists: -> 1632 bbox = a.get_tightbbox(renderer) 1633 if bbox is not None and (bbox.width != 0 or bbox.height != 0): 1634 bb.append(bbox) File ~/opt/anaconda3/lib/python3.9/site-packages/matplotlib/axes/_base.py:4619, in _AxesBase.get_tightbbox(self, renderer, call_axes_locator, bbox_extra_artists, for_layout_only) 4617 if self.xaxis.get_visible(): 4618 try: -> 4619 bb_xaxis = self.xaxis.get_tightbbox( 4620 renderer, for_layout_only=for_layout_only) 4621 except TypeError: 4622 # in case downstream library has redefined axis: 4623 bb_xaxis = self.xaxis.get_tightbbox(renderer) File ~/opt/anaconda3/lib/python3.9/site-packages/matplotlib/axis.py:1108, in Axis.get_tightbbox(self, renderer, for_layout_only) 1105 self._update_label_position(renderer) 1107 # go back to just this axis's tick labels -> 1108 ticklabelBoxes, ticklabelBoxes2 = self._get_tick_bboxes( 1109 ticks_to_draw, renderer) 1111 self._update_offset_text_position(ticklabelBoxes, ticklabelBoxes2) 1112 self.offsetText.set_text(self.major.formatter.get_offset()) File ~/opt/anaconda3/lib/python3.9/site-packages/matplotlib/axis.py:1085, in Axis._get_tick_bboxes(self, ticks, renderer) 1083 def _get_tick_bboxes(self, ticks, renderer): 1084 """Return lists of bboxes for ticks' label1's and label2's.""" -> 1085 return ([tick.label1.get_window_extent(renderer) 1086 for tick in ticks if tick.label1.get_visible()], 1087 [tick.label2.get_window_extent(renderer) 1088 for tick in ticks if tick.label2.get_visible()]) File ~/opt/anaconda3/lib/python3.9/site-packages/matplotlib/axis.py:1085, in <listcomp>(.0) 1083 def _get_tick_bboxes(self, ticks, renderer): 1084 """Return lists of bboxes for ticks' label1's and label2's.""" -> 1085 return ([tick.label1.get_window_extent(renderer) 1086 for tick in ticks if tick.label1.get_visible()], 1087 [tick.label2.get_window_extent(renderer) 1088 for tick in ticks if tick.label2.get_visible()]) File ~/opt/anaconda3/lib/python3.9/site-packages/matplotlib/text.py:910, in Text.get_window_extent(self, renderer, dpi) 907 raise RuntimeError('Cannot get window extent w/o renderer') 909 with cbook._setattr_cm(self.figure, dpi=dpi): --> 910 bbox, info, descent = self._get_layout(self._renderer) 911 x, y = self.get_unitless_position() 912 x, y = self.get_transform().transform((x, y)) File ~/opt/anaconda3/lib/python3.9/site-packages/matplotlib/text.py:441, in Text._get_layout(self, renderer) 438 bbox = Bbox.from_bounds(xmin, ymin, width, height) 440 # now rotate the positions around the first (x, y) position --> 441 xys = M.transform(offset_layout) - (offsetx, offsety) 443 ret = bbox, list(zip(lines, zip(ws, hs), *xys.T)), descent 444 self._cached[key] = ret KeyboardInterrupt:
Since I printed the graphics separately for each symbol, I printed out a few samples and stopped them.
Model
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import numpy as np
import joblib
import pandas as pd
# Veri setini yükle
data = pd.read_csv("standardized_isedataset.csv")
# Her bir sembolden eşit sayıda veri almak için veriyi sembollere göre grupla
grouped_data = data.groupby('Symbol_Encoded')
grouped_data = pd.DataFrame(grouped_data.apply(lambda x: x.sample(grouped_data.size().min()).reset_index(drop=True)))
# Bağımsız değişkenler ve hedef değişkeni ayır
X = grouped_data.drop(columns=["Predict"])
y = grouped_data["Predict"]
# Model tanımları
models = {
"Simple Linear Regression": LinearRegression(),
"Decision Tree Regression": DecisionTreeRegressor(),
"Random Forest Regression": RandomForestRegressor()
}
best_model_name = None
best_score = -1
# Modelleri değerlendir, veri setini bölebilir ve hem ortalama R^2 hem de test R^2 değerlerini yazdır
for name, model in models.items():
# Veri setini test etmek için bölelim
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Çapraz doğrulama ile modeli değerlendir
scores = cross_val_score(model, X_train, y_train, cv=10, scoring='r2')
mean_score = np.mean(scores)
if mean_score > best_score:
best_score = mean_score
best_model_name = name
# Modeli eğitim seti üzerinde eğit
model.fit(X_train, y_train)
# Test R^2 hesapla
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
# Eğitim R^2 hesapla
y_pred_train = model.predict(X_train)
r2_train = r2_score(y_train, y_pred_train)
# Sonuçları yazdır
print(f"{name}: Cross-validated R^2 scores: {scores}")
print(f"{name}: Average Cross-validated R^2 score: {np.mean(scores)}")
print(f"{name}: Test R^2 score = {r2}")
print(f"{name}: Train R^2 score = {r2_train}")
print("--------------------------------------")
# En iyi modeli seç
best_model = models[best_model_name]
# En iyi modeli eğitim seti üzerinde eğit
best_model.fit(X_train, y_train)
# En iyi modelin eğitim seti performansını hesapla
y_pred_train_best = best_model.predict(X_train)
r2_train_best = r2_score(y_train, y_pred_train_best)
# En iyi modelin test seti performansını hesapla
y_pred_best = best_model.predict(X_test)
r2_best = r2_score(y_test, y_pred_best)
# En iyi modelin adını ve performansını yazdır
print(f"The best model is: {best_model_name}, Average cross-validation R^2: {best_score}")
print(f"The best model is ({best_model_name}): Train R^2 score = {r2_train_best}, Test R^2 score = {r2_best}")
# En iyi modeli diske kaydet
joblib.dump(best_model, 'Best_Model.pkl')
Simple Linear Regression: Cross-validated R^2 scores: [0.99957118 0.99864285 0.9967277 0.99989179 0.99986738 0.99930626 0.99892652 0.99991406 0.99987594 0.99415999] Simple Linear Regression: Average Cross-validated R^2 score: 0.9986883660169404 Simple Linear Regression: Test R^2 score = 0.9750645913388394 Simple Linear Regression: Train R^2 score = 0.9996791190684872 -------------------------------------- Decision Tree Regression: Cross-validated R^2 scores: [0.98501935 0.99257975 0.97920722 0.99185704 0.99251042 0.9723447 0.98772825 0.8459658 0.98899607 0.98688656] Decision Tree Regression: Average Cross-validated R^2 score: 0.9723095155955346 Decision Tree Regression: Test R^2 score = 0.9943272051054185 Decision Tree Regression: Train R^2 score = 1.0 -------------------------------------- Random Forest Regression: Cross-validated R^2 scores: [0.9952095 0.99572558 0.99502023 0.97396329 0.99920019 0.99651583 0.99470927 0.98587237 0.9973261 0.99526298] Random Forest Regression: Average Cross-validated R^2 score: 0.9928805319411248 Random Forest Regression: Test R^2 score = 0.999330653305865 Random Forest Regression: Train R^2 score = 0.9984331463991017 -------------------------------------- The best model is: Simple Linear Regression, Average cross-validation R^2: 0.9986883660169404 The best model is (Simple Linear Regression): Train R^2 score = 0.9996791190684872, Test R^2 score = 0.9750645913388394
['Best_Model.pkl']